In [1]:
# Widen the notebook cells to use most of the browser window.
# IPython.display is the public import path; IPython.core.display is an
# internal module and is deprecated in newer IPython releases.
from IPython.display import display, HTML, IFrame

display(HTML("""<style> .container {width:96% !important;}</style>"""))
In [2]:
import sys
# Make the project root importable so `utils` and `preprocessing` resolve.
sys.path.insert(0,'../')
# Star-import brings in path_SBA and save_csv used throughout this notebook.
# NOTE(review): wildcard imports hide where names come from -- consider
# `from utils.paths import path_SBA, save_csv` explicitly.
from utils.paths import *
In [3]:
# __future__ imports must precede every other statement in a module/cell,
# so the true-division import comes first (kernel is Python 2).
from __future__ import division

import pandas as pd
import numpy as np

# cufflinks/plotly power the .iplot() calls in the EDA section below;
# re-enable these four lines to render those interactive charts.
# from plotly.offline import init_notebook_mode, iplot
# import cufflinks as cf
# init_notebook_mode()
# cf.go_offline()
In [4]:
def table(no):
    """Load supplementary table ``no`` from the SBA study paper.

    The S3 bucket holds 13 such tables named t0001-... through t0013-...,
    so the table number is zero-padded to four digits.
    """
    # there are 13 additional tables
    # 't%04d' generalizes the original 't00%02d' (identical for no <= 99)
    return pd.read_csv(path_SBA + 't%04d' % no + '-10.1080%2F10691898.2018.1434342.csv')
table(1)

Read data

!aws s3 ls s3://eh-home/ehda-calvin/SBA_study/
In [5]:
nat = pd.read_csv(path_SBA + 'SBAnational.csv', low_memory=False)
In [32]:
# Preprocessing
# Helpers live in ../preprocessing.py; reload picks up edits without
# restarting the kernel. (`reload` is a Python 2 builtin -- on Python 3
# use importlib.reload.)
import preprocessing as pp
reload(pp)
Out[32]:
<module 'preprocessing' from '../preprocessing.pyc'>
In [7]:
%%time
nat = nat[nat.ApprovalFY != '1976A']
nat['ApprovalFY'] = nat.ApprovalFY.astype(int)
# nat = nat[(nat.ApprovalFY >= 1990) & pd.notnull(nat.MIS_Status)]
nat = nat[pd.notnull(nat.MIS_Status)]
nat = nat[pd.notnull(nat.Name)]

# convert to timestamp
nat['ApprovalDate'] = pd.to_datetime(nat['ApprovalDate'], errors = 'coerce')
nat['DisbursementDate'] = pd.to_datetime(nat['DisbursementDate'], errors = 'coerce')

# convert $ to float
nat['DisbursementGross'] = nat['DisbursementGross'].apply(pp.to_float)
nat['BalanceGross'] = nat['BalanceGross'].apply(pp.to_float)
nat['ChgOffPrinGr'] = nat['ChgOffPrinGr'].apply(pp.to_float)
nat['GrAppv'] = nat['GrAppv'].apply(pp.to_float)
nat['SBA_Appv'] = nat['SBA_Appv'].apply(pp.to_float)

# Default
nat['default'] = nat.MIS_Status.apply(pp.default)
CPU times: user 5min 32s, sys: 1.02 s, total: 5min 33s
Wall time: 5min 33s
In [8]:
%%time
# cleaning data
nat['Zip5d'] = nat.apply(lambda x: pp.zip_5d(x['Zip'], x['State']), axis = 1)
nat['Zip3d'] = nat.Zip5d.str[:3]
nat['LowDoc'] = nat.LowDoc.apply(pp.clean_LowDoc)
nat['SBA_ratio'] = nat.SBA_Appv / nat.GrAppv
nat['RevLineCr'] = nat.RevLineCr.apply(pp.clean_RevLineCr)
nat['Zip_length'] = nat.Zip.apply(lambda x: len(str(x)))
nat['RealEstate'] = nat.Term.apply(pp.RealEstate)
nat['NAICS_default_rate'] = nat.NAICS.apply(pp.naics_defaut_rate)
nat['NAICS_group'] = nat.NAICS.apply(pp.naics_sector)
nat['FranchiseCode'] = nat.FranchiseCode.apply(pp.franchise)
CPU times: user 2min 18s, sys: 1.06 s, total: 2min 19s
Wall time: 2min 19s
In [14]:
# fix missing state (8 rows lack State per the isnull() audit further down;
# see pp.fix_missing_state for how they are recovered)
nat = pp.fix_missing_state(nat)
In [19]:
nat['Name2'] = nat['Name'] + '|' + nat['State'] + '(' + nat['Zip5d'].astype(str) + ')'
In [21]:
nat.Name2.sample(10)
Out[21]:
16473     Maria G. Abbas dba Earth Spiri|MD(21093)
283659               THE MEDICINE SHOPPE|CA(95361)
433950                   Indocurrent LLC|NY(11735)
584702    CHAMPAK/SHARDA BHOJA DBA DAYSI|GA(31206)
737823              MUSICAL PRODUCTS INC|IL(60616)
270061    THE AFTROBABIES COLLECTION INC|CA(90056)
871669        FLIP-FLOP COLLECTION, INC.|CA(90015)
820860                            SUBWAY|CA(94107)
815350                    PLATO'S CLOSET|TN(37421)
3901      Mahmoud Abdalla dba Texas Used|TX(76016)
Name: Name2, dtype: object
In [22]:
nat.Name2.value_counts().head()
Out[22]:
WEST TEXAS LEE CO INC|TX(79329)             134
Meathead Movers, Inc.|CA(93401)              42
REGENT FLOORS, INC.|PA(15137)                42
ECKERT DRILLING & CONSTRUCTION|PA(15235)     27
WEST TEXAS LEE COMPANY INC|TX(79329)         26
Name: Name2, dtype: int64
In [23]:
nat.Name.value_counts().head()
Out[23]:
SUBWAY                 1269
QUIZNO'S SUBS           433
COLD STONE CREAMERY     366
QUIZNO'S                345
DOMINO'S PIZZA          328
Name: Name, dtype: int64

Create features using historical records

In [24]:
# Create a record of company applied for loan: one row per company (Name2)
# with the fiscal years of all its approved loans. A groupby replaces the
# original row-by-row .iloc[i] loop, which did ~900k single-row lookups;
# groupby also drops NaN Name2 keys, so no explicit isnull() drop is needed,
# and sorts by key like the old dict-of-columns construction did.
loan_record_df = (nat.groupby('Name2')['ApprovalFY']
                     .apply(list)
                     .rename('loan_list')
                     .reset_index())

# first fiscal year the company ever borrowed
loan_record_df['loan_start'] = loan_record_df.loan_list.apply(min)
# {fiscal year: number of loans approved that year}
loan_record_df['loan_record_dict'] = loan_record_df.loan_list.apply(
    lambda years: pd.Series(years).value_counts().to_dict())

# company name without the trailing "|STATE(zip)" disambiguator
loan_record_df['Name'] = loan_record_df['Name2'].apply(lambda x: x.split('|')[0])

loan_record_df['suffix'] = loan_record_df['Name'].apply(pp.company_suffix)

display(loan_record_df.head(10))

save_csv(loan_record_df, 'extra_company_info.csv')
Name2 loan_list loan_start loan_record_dict Name suffix
0 "216" PIANO BAR|AR(72764) [2002] 2002 {2002: 1} "216" PIANO BAR MARKET
1 "360" SALON & DAY SPA|WV(26101) [2002] 2002 {2002: 1} "360" SALON & DAY SPA NO SUFFIX
2 "821"|DE(19801) [1999] 1999 {1999: 1} "821" NO SUFFIX
3 "A CLASS ACT" BEAUTY SALON|NV(89102) [1989] 1989 {1989: 1} "A CLASS ACT" BEAUTY SALON SALON
4 "A LA MODE" ICE CREAM PARLOUR|FL(34221) [1997] 1997 {1997: 1} "A LA MODE" ICE CREAM PARLOUR NO SUFFIX
5 "A MARKET" NATURAL FOODS|NH(03103) [1992] 1992 {1992: 1} "A MARKET" NATURAL FOODS NO SUFFIX
6 "A SHEAR DELITE" STYLING CENTE|MS(39180) [1996] 1996 {1996: 1} "A SHEAR DELITE" STYLING CENTE NO SUFFIX
7 "A" COMPANY, INC.|ID(83705) [1990] 1990 {1990: 1} "A" COMPANY, INC. INC
8 "A" YOGURT ENTERPRISES|TX(79902) [1989] 1989 {1989: 1} "A" YOGURT ENTERPRISES ENTERPRISES
9 "AI" TERIYAKI HOUSE|WA(98003) [1995] 1995 {1995: 1} "AI" TERIYAKI HOUSE NO SUFFIX
<---- Saving csv file to s3 ---->
In [25]:
# Create a record of company default: same aggregation as the loan record
# above, restricted to defaulted loans -- one row per company with the
# fiscal years in which it defaulted. groupby replaces the original
# per-row .iloc loop and drops NaN Name2 keys on its own.
nat_d = nat[nat.default == 1].reset_index(drop = True)

default_record_df = (nat_d.groupby('Name2')['ApprovalFY']
                          .apply(list)
                          .rename('default_list')
                          .reset_index())

# {fiscal year: number of defaults that year}
default_record_df['default_record_dict'] = default_record_df.default_list.apply(
    lambda years: pd.Series(years).value_counts().to_dict())

# company name without the trailing "|STATE(zip)" disambiguator
default_record_df['Name'] = default_record_df['Name2'].apply(lambda x: x.split('|')[0])

display(default_record_df.head(10))

save_csv(default_record_df, 'company_default_record.csv')
Name2 default_list default_record_dict Name
0 "ALEXANDRA G" SHRIMP TRAWLER|TX(77465) [2001] {2001: 1} "ALEXANDRA G" SHRIMP TRAWLER
1 "B" SWEET|NJ(07728) [1998] {1998: 1} "B" SWEET
2 "E" STREET AUTOMOTIVE REPAIR|CA(95816) [1987] {1987: 1} "E" STREET AUTOMOTIVE REPAIR
3 "FORE" SEASONS, LLC|OH(44087) [2003] {2003: 1} "FORE" SEASONS, LLC
4 "HOOT" GIBSON SALES CO.|TX(78754) [1988] {1988: 1} "HOOT" GIBSON SALES CO.
5 "I" TEMP HVAC|TX(75228) [2003] {2003: 1} "I" TEMP HVAC
6 "JOSEPH E. ELLIOTT, JR. DBA|CT(06511) [1994] {1994: 1} "JOSEPH E. ELLIOTT, JR. DBA
7 "LADY JADE"|LA(70815) [2001] {2001: 1} "LADY JADE"
8 "SANUS" BOSNIAN CAFE AND GRILL|MN(55904) [2003] {2003: 1} "SANUS" BOSNIAN CAFE AND GRILL
9 "SJSH" LLC|UT(84604) [2003] {2003: 1} "SJSH" LLC
<---- Saving csv file to s3 ---->
In [26]:
# Open saved loan records
from ast import literal_eval

# save_csv writes ';'-separated files; the dict column round-trips as its
# repr, so parse it back with the safe literal_eval (never eval file input).
extinf = pd.read_csv(path_SBA + 'extra_company_info.csv', sep = ';', low_memory=False)
extinf['loan_record_dict'] = extinf['loan_record_dict'].apply(literal_eval)
# sanity check: companies with loans in more than 3 distinct fiscal years
extinf[extinf.loan_record_dict.apply(lambda x: len(x.keys())) > 3].head()
Out[26]:
Name2 loan_list loan_start loan_record_dict Name suffix
951 121 MARKETING SERVICES GROUP I|MN(55374) [2000, 2001, 2001, 2002, 2004, 2005] 2000 {2000: 1, 2001: 2, 2002: 1, 2004: 1, 2005: 1} 121 MARKETING SERVICES GROUP I NO SUFFIX
8903 A TO Z MACHINING SERVICE|OK(74601) [1998, 1999, 1991, 1994] 1991 {1994: 1, 1991: 1, 1998: 1, 1999: 1} A TO Z MACHINING SERVICE SERVICE
13795 ABENTROTH DISTRIBUTING, INC.|ND(58223) [2000, 2001, 2004, 2004, 2005] 2000 {2000: 1, 2001: 1, 2004: 2, 2005: 1} ABENTROTH DISTRIBUTING, INC. INC
14361 ABRAHAM STEEL FABRICATION INC|CA(93401) [2007, 2008, 2004, 2005] 2004 {2008: 1, 2004: 1, 2005: 1, 2007: 1} ABRAHAM STEEL FABRICATION INC INC
14402 ABRASIVES, INC.|ND(58533) [1998, 1998, 1999, 2001, 1991, 1994, 1995] 1991 {1991: 1, 1994: 1, 1995: 1, 1998: 2, 1999: 1, ... ABRASIVES, INC. INC
In [27]:
# Open saved default records (same ';'-separated format and literal_eval
# round-trip as the loan records above)
default_record = pd.read_csv(path_SBA + 'company_default_record.csv', sep = ';', low_memory=False)
default_record['default_record_dict'] = default_record['default_record_dict'].apply(literal_eval)
# sanity check: companies with defaults in more than 3 distinct fiscal years
default_record[default_record.default_record_dict.apply(lambda x: len(x.keys())) > 3].head()
Out[27]:
Name2 default_list default_record_dict Name
24311 CASH & CARRY WHOLESALE, INC.|MO(65802) [2001, 2002, 2003, 2004, 2004, 2005] {2001: 1, 2002: 1, 2003: 1, 2004: 2, 2005: 1} CASH & CARRY WHOLESALE, INC.
26882 CITRUS CAFE, INC|CA(92780) [2007, 2008, 2009, 2010] {2008: 1, 2009: 1, 2010: 1, 2007: 1} CITRUS CAFE, INC
40505 DUTCH BOYS CARPET CLEANING INC|WI(53070) [1998, 2002, 2003, 2005] {2002: 1, 2003: 1, 2005: 1, 1998: 1} DUTCH BOYS CARPET CLEANING INC
50125 FORGET ME NOT FLORIST|LA(71108) [2006, 2007, 2008, 2005] {2008: 1, 2005: 1, 2006: 1, 2007: 1} FORGET ME NOT FLORIST
136391 TONY COMPTON AND TERRIE COMPTO|MS(39114) [2009, 2010, 2003, 2003, 2006] {2009: 1, 2010: 1, 2003: 2, 2006: 1} TONY COMPTON AND TERRIE COMPTO

Create features

In [28]:
# Attach per-company loan history, then derive per-row features.
# Left merges keep all nat rows; companies absent from the record tables
# get NaN dicts (presumably handled inside the pp.* helpers -- TODO confirm).
nat = nat.merge(extinf[['Name2', 'loan_start', 'loan_record_dict', 'suffix']] , how = 'left', on = 'Name2')

# Loan_age / Previous_loan: history features computed relative to this
# row's ApprovalFY (see pp.loan_age / pp.previous_loan for exact semantics).
nat['Loan_age'] = nat.apply(lambda x: pp.loan_age(x['ApprovalFY'], x['loan_record_dict']), axis = 1)
nat['Previous_loan'] = nat.apply(lambda x: pp.previous_loan(x['ApprovalFY'], x['loan_record_dict']), axis = 1)

nat = nat.merge(default_record[['Name2', 'default_record_dict']] , how = 'left', on = 'Name2')

# default_times: company default history relative to this row's ApprovalFY.
nat['default_times'] = nat.apply(lambda x: pp.default_times(x['ApprovalFY'], x['default_record_dict']), axis = 1)
In [29]:
nat[['Loan_age', 'Previous_loan', 'default_times']].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 897137 entries, 0 to 897136
Data columns (total 3 columns):
Loan_age         897137 non-null int64
Previous_loan    897137 non-null int64
default_times    897137 non-null int64
dtypes: int64(3)
memory usage: 27.4 MB
loan_record_df[loan_record_df.Suffix == 'NO SUFFIX'].Name.apply(lambda x: x.split()[-1].replace('.', '').upper()).value_counts().head(50)
In [34]:
nat.head()
Out[34]:
LoanNr_ChkDgt Name City State Zip Bank BankState NAICS ApprovalDate ApprovalFY ... NAICS_default_rate NAICS_group Name2 loan_start loan_record_dict suffix Loan_age Previous_loan default_record_dict default_times
0 1000014003 ABC HOBBYCRAFT EVANSVILLE IN 47711 FIFTH THIRD BANK OH 451120 1997-02-28 1997 ... 23.0 44 ABC HOBBYCRAFT|IN(47711) 1997 {1997: 1} NO SUFFIX 0 0 NaN 0
1 1000024006 LANDMARK BAR & GRILLE (THE) NEW PARIS IN 46526 1ST SOURCE BANK IN 722410 1997-02-28 1997 ... 22.0 72 LANDMARK BAR & GRILLE (THE)|IN(46526) 1997 {1997: 1} NO SUFFIX 0 0 NaN 0
2 1000034009 WHITLOCK DDS, TODD M. BLOOMINGTON IN 47401 GRANT COUNTY STATE BANK IN 621210 1997-02-28 1997 ... 10.0 62 WHITLOCK DDS, TODD M.|IN(47401) 1997 {1997: 1} NO SUFFIX 0 0 NaN 0
3 1000044001 BIG BUCKS PAWN & JEWELRY, LLC BROKEN ARROW OK 74012 1ST NATL BK & TR CO OF BROKEN OK 0 1997-02-28 1997 ... NaN 0 BIG BUCKS PAWN & JEWELRY, LLC|OK(74012) 1997 {1997: 1} LLC 0 0 NaN 0
4 1000054004 ANASTASIA CONFECTIONS, INC. ORLANDO FL 32801 FLORIDA BUS. DEVEL CORP FL 0 1997-02-28 1997 ... NaN 0 ANASTASIA CONFECTIONS, INC.|FL(32801) 1997 {1997: 1} INC 0 0 NaN 0

5 rows × 43 columns

Exploratory data analysis

In [ ]:
nat.GrAppv.iplot(kind = 'hist', bins = 100, title = 'Histogram of grant approved', xTitle = 'Grant')
In [ ]:
# No. of loan each year (count().max(1) gives the row count per FY group)
nat.groupby('ApprovalFY').count().max(1).iplot(kind = 'bar', title = 'Loan cases')

Default rate

In [ ]:
# Default rate per fiscal year: defaulted loans / all loans in that year.
(nat[nat.default == 1].groupby('ApprovalFY').count().max(1) / 
 nat.groupby('ApprovalFY').count().max(1)).iplot(kind = 'bar', title = 'Default rate')
In [ ]:
nat[nat.default == 1].groupby('ApprovalFY').SBA_Appv.sum().iplot(kind = 'bar')
In [ ]:
nat[nat.default == 1].ChgOffPrinGr.iplot(kind = 'hist', bins = 50)
In [ ]:
nat.head().T
In [ ]:
nat.State.value_counts().head()
In [ ]:
# default rate (overall share of defaulted loans; true division is active
# via the __future__ import at the top)
nat['default'].sum() / len(nat)
In [ ]:
nat.default.value_counts()

LowDoc

LowDoc (Y = Yes, N = No): In order to process more loans efficiently, a “LowDoc Loan” program was implemented where loans under $150,000 can be processed using a one-page application. “Yes” indicates loans with a one-page application, and “No” indicates loans with more information attached to the application. In this dataset, 87.31% are coded as N (No) and 12.31% as Y (Yes) for a total of 99.62%. It is worth noting that 0.38% have other values (0, 1, A, C, R, S); these are data entry errors.

In [ ]:
nat.LowDoc.sample(10)

NAICS

NAICS (North American Industry Classification System): This is a 2- through 6-digit hierarchical classification system used by Federal statistical agencies in classifying business establishments for the collection, analysis, and presentation of statistical data describing the U.S. economy. The first two digits of the NAICS classification represent the economic sector.

In [ ]:
table(3)
In [ ]:
nat.NAICS.value_counts().head()
In [ ]:
# nat.NAICS.apply(lambda x: len(str(x))).value_counts()

    
In [ ]:
nat['NAICS_group'].value_counts().sort_index()

Loans Backed by Real Estate

Whether a loan is backed by real estate (possession of land) is another risk indicator that is discussed. The rationale for this indicator is that the value of the land is often large enough to cover the amount of any principal outstanding, thereby reducing the probability of default.

Since the term of the loan is a function of the expected lifetime of the assets, loans backed by real estate will have terms 20 years or greater (≥240 months) and are the only loans granted for such a long term, whereas loans not backed by real estate will have terms less than 20 years (<240 months). Therefore, the authors created a dummy variable, “RealEstate,” where “RealEstate” = 1 if “Term” ≥240 months and “RealEstate” = 0 if “Term” <240 months.

In [29]:
nat.head()
Out[29]:
LoanNr_ChkDgt Name City State Zip Bank BankState NAICS ApprovalDate ApprovalFY ... DisbursementGross BalanceGross MIS_Status ChgOffPrinGr GrAppv SBA_Appv default NAICS_group NAICS_default_rate RealEstate
0 1000014003 ABC HOBBYCRAFT EVANSVILLE IN 47711 FIFTH THIRD BANK OH 451120 1997-02-28 1997 ... 60000.0 0.0 P I F 0.0 60000.0 48000.0 0 44 23.0 0
1 1000024006 LANDMARK BAR & GRILLE (THE) NEW PARIS IN 46526 1ST SOURCE BANK IN 722410 1997-02-28 1997 ... 40000.0 0.0 P I F 0.0 40000.0 32000.0 0 72 22.0 0
2 1000034009 WHITLOCK DDS, TODD M. BLOOMINGTON IN 47401 GRANT COUNTY STATE BANK IN 621210 1997-02-28 1997 ... 287000.0 0.0 P I F 0.0 287000.0 215250.0 0 62 10.0 0
3 1000044001 BIG BUCKS PAWN & JEWELRY, LLC BROKEN ARROW OK 74012 1ST NATL BK & TR CO OF BROKEN OK 0 1997-02-28 1997 ... 35000.0 0.0 P I F 0.0 35000.0 28000.0 0 0 NaN 0
4 1000054004 ANASTASIA CONFECTIONS, INC. ORLANDO FL 32801 FLORIDA BUS. DEVEL CORP FL 0 1997-02-28 1997 ... 229000.0 0.0 P I F 0.0 229000.0 229000.0 0 0 NaN 1

5 rows × 31 columns

SBA's Guaranteed Portion of Approved Loan

The portion which is the percentage of the loan that is guaranteed by SBA (represented as “Portion” in the dataset) is a final risk indicator that is discussed in our courses. This is one of the variables that the authors generated calculating the ratio of the amount of the loan SBA guarantees and the gross amount approved by the bank (SBA_Appv/GrAppv)

In [30]:
nat.SBA_ratio.iplot(kind = 'hist', bins = 10, title = 'Histogram of SBA approved ratio', xTitle = 'SBA_ratio')
In [31]:
nat[nat.default == 1].SBA_ratio.iplot(kind = 'hist', bins = 10)
In [32]:
nat.groupby('default').SBA_ratio.mean()
Out[32]:
default
0    0.718599
1    0.632673
Name: SBA_ratio, dtype: float64
In [33]:
nat.head().T
Out[33]:
0 1 2 3 4
LoanNr_ChkDgt 1000014003 1000024006 1000034009 1000044001 1000054004
Name ABC HOBBYCRAFT LANDMARK BAR & GRILLE (THE) WHITLOCK DDS, TODD M. BIG BUCKS PAWN & JEWELRY, LLC ANASTASIA CONFECTIONS, INC.
City EVANSVILLE NEW PARIS BLOOMINGTON BROKEN ARROW ORLANDO
State IN IN IN OK FL
Zip 47711 46526 47401 74012 32801
Bank FIFTH THIRD BANK 1ST SOURCE BANK GRANT COUNTY STATE BANK 1ST NATL BK & TR CO OF BROKEN FLORIDA BUS. DEVEL CORP
BankState OH IN IN OK FL
NAICS 451120 722410 621210 0 0
ApprovalDate 1997-02-28 00:00:00 1997-02-28 00:00:00 1997-02-28 00:00:00 1997-02-28 00:00:00 1997-02-28 00:00:00
ApprovalFY 1997 1997 1997 1997 1997
Term 84 60 180 60 240
NoEmp 4 2 7 2 14
NewExist 2 2 1 1 1
CreateJob 0 0 0 0 7
RetainedJob 0 0 0 0 7
FranchiseCode 1 1 1 1 1
UrbanRural 0 0 0 0 0
RevLineCr N N N N N
LowDoc 1 1 0 1 0
ChgOffDate NaN NaN NaN NaN NaN
DisbursementDate 1999-02-28 00:00:00 1997-05-31 00:00:00 1997-12-31 00:00:00 1997-06-30 00:00:00 1997-05-14 00:00:00
DisbursementGross 60000 40000 287000 35000 229000
BalanceGross 0 0 0 0 0
MIS_Status P I F P I F P I F P I F P I F
ChgOffPrinGr 0 0 0 0 0
GrAppv 60000 40000 287000 35000 229000
SBA_Appv 48000 32000 215250 28000 229000
default 0 0 0 0 0
NAICS_group 44 72 62 0 0
NAICS_default_rate 23 22 10 NaN NaN
RealEstate 0 0 0 0 1
SBA_ratio 0.8 0.8 0.75 0.8 1

RevLineCr

Revolving Line of Credit: Y = Yes, N = No (the value counts below show many other stray codes, which are data-entry errors).

In [34]:
nat.RevLineCr.value_counts()
Out[34]:
N    391402
0    257431
Y    200631
T     15239
1        23
R        14
`        11
2         6
C         2
5         1
7         1
4         1
-         1
A         1
Q         1
3         1
,         1
.         1
Name: RevLineCr, dtype: int64

Zip code

A ZIP Code is a postal code used by the United States Postal Service (USPS) in a system it introduced in 1963.

The first digit of the ZIP Code is allocated as follows:

0 = Connecticut (CT), Massachusetts (MA), Maine (ME), New Hampshire (NH), New Jersey (NJ), New York (NY, Fishers Island only), Puerto Rico (PR), Rhode Island (RI), Vermont (VT), Virgin Islands (VI), Army Post Office Europe (AE), Fleet Post Office Europe (AE) 1 = Delaware (DE), New York (NY), Pennsylvania (PA) 2 = District of Columbia (DC), Maryland (MD), North Carolina (NC), South Carolina (SC), Virginia (VA), West Virginia (WV) 3 = Alabama (AL), Florida (FL), Georgia (GA), Mississippi (MS), Tennessee (TN), Army Post Office Americas (AA), Fleet Post Office Americas (AA) 4 = Indiana (IN), Kentucky (KY), Michigan (MI), Ohio (OH) 5 = Iowa (IA), Minnesota (MN), Montana (MT), North Dakota (ND), South Dakota (SD), Wisconsin (WI) 6 = Illinois (IL), Kansas (KS), Missouri (MO), Nebraska (NE) 7 = Arkansas (AR), Louisiana (LA), Oklahoma (OK), Texas (TX) 8 = Arizona (AZ), Colorado (CO), Idaho (ID), New Mexico (NM), Nevada (NV), Utah (UT), Wyoming (WY) 9 = Alaska (AK), American Samoa (AS), California (CA), Guam (GU), Hawaii (HI), Marshall Islands (MH), Federated States of Micronesia (FM), Northern Mariana Islands (MP), Oregon (OR), Palau (PW), Washington (WA), Army Post Office Pacific (AP), Fleet Post Office Pacific (AP)

The next two digits represent the sectional center facility (SCF) (e.g. 477xx = Vanderburgh County, Indiana), and the fourth and fifth digits represent the area of the city (if in a metropolitan area), or a village/town (outside metro areas): 47722 (4=Indiana, 77=Vanderburgh County, 22=University of Evansville area). When a sectional center facility's area crosses state lines, that facility is assigned separate three-digit prefixes for the states that it serves.

(https://en.wikipedia.org/wiki/ZIP_Code)

Connecticut (CT) Massachusetts (MA) Maine (ME) New Hampshire (NH) New Jersey (NJ) New York (NY) Puerto Rico (PR) Rhode Island (RI) Vermont (VT) Virgin Islands (VI) Army Post Office Europe (AE) Fleet Post Office Europe (AE)

'CT','MA','ME','NH','NJ','NY','PR','RI','VT','VI','AE','AE'

In [36]:
nat['Zip_length'].value_counts()
Out[36]:
5    777484
4     91392
1       377
3        33
2         2
Name: Zip_length, dtype: int64

It seems that some of the ZIP codes have missing or truncated values!

In [38]:
nat[nat['Zip5d'] == '99999'].shape
Out[38]:
(828, 34)
In [40]:
nat['Zip5d'].value_counts().head()
Out[40]:
10001    924
90015    918
99999    828
93401    744
90010    732
Name: Zip5d, dtype: int64
In [41]:
nat['Zip3d'].value_counts().head()
Out[41]:
900    12286
770    10073
840     7916
750     7824
917     6962
Name: Zip3d, dtype: int64
In [42]:
# Keep a tidy column subset for the saved dataset.
# NOTE(review): this list omits the engineered history columns built above
# (Name2, loan_start, Loan_age, Previous_loan, default_times, suffix) --
# execution counts in this notebook are out of order, so confirm this cell
# belongs to the intended pass before relying on the saved output.
nat = nat[['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Zip5d', 'Zip3d', 'Bank', 'BankState', 'NAICS', 'NAICS_group', 
           'NAICS_default_rate', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob', 
           'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross', 
           'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'default', 'RealEstate', 'SBA_ratio']]
In [46]:
pd.isnull(nat).sum()
Out[46]:
LoanNr_ChkDgt              0
Name                      12
City                       0
State                      8
Zip5d                      0
Zip3d                      0
Bank                    1037
BankState               1043
NAICS                      0
NAICS_group                0
NAICS_default_rate    181207
ApprovalDate               0
ApprovalFY                 0
Term                       0
NoEmp                      0
NewExist                 134
CreateJob                  0
RetainedJob                0
FranchiseCode              0
UrbanRural                 0
RevLineCr                  0
LowDoc                  5999
ChgOffDate            714886
DisbursementDate        2011
DisbursementGross          0
BalanceGross               0
MIS_Status                 0
ChgOffPrinGr               0
GrAppv                     0
SBA_Appv                   0
default                    0
RealEstate                 0
SBA_ratio                  0
dtype: int64

Saving datasets

In [37]:
!aws s3 ls --human-readable s3://eh-home/ehda-calvin/SBA_study/
                           PRE pdf/
2018-10-29 08:48:12   15.8 KiB 7a_504_FOIA Data Dictionary.xlsx
2018-10-29 08:48:13   28.9 MiB FOIA - 504 (FY1991-Present).xlsx
2018-10-29 08:48:15   52.6 MiB FOIA - 7(a)(FY1991-FY1999).xlsx
2018-10-29 08:48:17  111.1 MiB FOIA - 7(a)(FY2000-FY2009).xlsx
2018-10-29 08:48:19   80.8 MiB FOIA - 7(a)(FY2010-Present).xlsx
2018-10-26 03:13:21  171.1 MiB SBAnational.csv
2018-11-01 07:21:49  232.1 MiB SBAnational_new.csv
2018-11-01 07:11:23   10.4 MiB company_default_record.csv
2018-11-01 07:03:29   67.9 MiB extra_company_info.csv
2018-10-29 03:22:11    9.6 MiB loan_record.csv
2018-10-29 02:42:40    1.4 KiB t0001-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:41  772 Bytes t0002-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:42  785 Bytes t0003-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:43  913 Bytes t0004-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:43  202 Bytes t0005-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:44  203 Bytes t0006-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:45  289 Bytes t0007-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:46  408 Bytes t0008-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:47  204 Bytes t0009-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:48  294 Bytes t0010-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:49  250 Bytes t0011-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:50  316 Bytes t0012-10.1080%2F10691898.2018.1434342.csv
2018-10-29 02:42:50    2.3 KiB t0013-10.1080%2F10691898.2018.1434342.csv
In [35]:
save_csv(nat, 'SBAnational_new.csv')
<---- Saving csv file to s3 ---->

Extract information from extra dataset

In [7]:
# Read the three FOIA 7(a) spreadsheets and stack them into one frame.
# Collect the pieces in a list and concat once at the end -- growing a
# DataFrame with pd.concat inside the loop re-copies all prior rows on
# every iteration.
frames = []
for f in ['FOIA - 7(a)(FY1991-FY1999).xlsx', 
          'FOIA - 7(a)(FY2000-FY2009).xlsx', 
          'FOIA - 7(a)(FY2010-Present).xlsx']:
    temp = pd.read_excel(path_SBA + f)
    # keep only 7A-program rows where every key field is present
    temp = temp[(pd.notnull(temp.BorrName)) & (pd.notnull(temp.BorrState)) & 
                (pd.notnull(temp.BorrZip)) & (temp.Program == '7A')]
    temp = temp[['BorrName', 'BorrState', 'BorrZip', 'BusinessType']]
    # same composite "NAME|ST(zip)" key as the national table, for merging
    temp['Name2'] = temp['BorrName'] + '|' + temp['BorrState'] + '(' + temp['BorrZip'].astype(str) + ')'
    display(temp.head())
    frames.append(temp)

df = pd.concat(frames)

df.shape
BorrName BorrState BorrZip BusinessType Name2
0 NEW ERA, INC. TN 37041 CORPORATION NEW ERA, INC.|TN(37041)
1 M.C. INTERNATIONAL, INC TN 37207 CORPORATION M.C. INTERNATIONAL, INC|TN(37207)
2 TECHNIGRAPHICS UT 84119 PARTNERSHIP TECHNIGRAPHICS|UT(84119)
3 GREEN VALLEY GROCERY/CRAWFORD NV 89103 CORPORATION GREEN VALLEY GROCERY/CRAWFORD|NV(89103)
4 Michael Davis WI 54936 CORPORATION Michael Davis|WI(54936)
BorrName BorrState BorrZip BusinessType Name2
0 CREATIVE SUNROOM DESIGNS INC IN 46256 CORPORATION CREATIVE SUNROOM DESIGNS INC|IN(46256)
1 Tan Nhu Tieu TX 75023 CORPORATION Tan Nhu Tieu|TX(75023)
2 DESIGNER CHOICE CA 90016 INDIVIDUAL DESIGNER CHOICE|CA(90016)
3 K & L AUTOMOTIVE REPAIR INC NJ 7065 CORPORATION K & L AUTOMOTIVE REPAIR INC|NJ(7065)
4 RANDY'S STEAK HOUSE TX 75034 CORPORATION RANDY'S STEAK HOUSE|TX(75034)
BorrName BorrState BorrZip BusinessType Name2
0 CRESA PARTNERS - DENVER, INC. CO 80237 CORPORATION CRESA PARTNERS - DENVER, INC.|CO(80237)
1 The Hilltop Tavern CA 95819 CORPORATION The Hilltop Tavern|CA(95819)
2 River City Car Wash LLC CA 95691 CORPORATION River City Car Wash LLC|CA(95691)
3 Alphagraphics CT 6810 CORPORATION Alphagraphics|CT(6810)
4 ON SITE AUTOMOTIVE APPEARANCE IN 46540 INDIVIDUAL ON SITE AUTOMOTIVE APPEARANCE|IN(46540)
Out[7]:
(1521154, 5)
In [13]:
df = df[pd.notnull(df.BusinessType)]
In [14]:
# Deduplicate on the company key, keeping the most recent record
# (keep='last' -- files were concatenated oldest to newest).
print df.shape
df = df.drop_duplicates('Name2', keep = 'last')
print df.shape
(1521109, 5)
(1339983, 5)
In [15]:
df.head()
Out[15]:
BorrName BorrState BorrZip BusinessType Name2
0 NEW ERA, INC. TN 37041 CORPORATION NEW ERA, INC.|TN(37041)
1 M.C. INTERNATIONAL, INC TN 37207 CORPORATION M.C. INTERNATIONAL, INC|TN(37207)
2 TECHNIGRAPHICS UT 84119 PARTNERSHIP TECHNIGRAPHICS|UT(84119)
3 GREEN VALLEY GROCERY/CRAWFORD NV 89103 CORPORATION GREEN VALLEY GROCERY/CRAWFORD|NV(89103)
4 Michael Davis WI 54936 CORPORATION Michael Davis|WI(54936)
In [16]:
save_csv(df, 'company_business_type.csv')
<---- Saving csv file to s3 ---->

Create a Zip to Fips dictionary

In [17]:
# converting ZIP code to FIPS code
# Census ZCTA-to-county relationship file (fetched over the network).
link1 = 'https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt'
# dtype=str preserves leading zeros in ZIPs and state/county codes
ztf = pd.read_csv(link1, usecols = ['ZCTA5','STATE','COUNTY'], dtype = str)
ztf = ztf.rename(columns = {'ZCTA5': 'zip'})
ztf['fips'] = ztf.STATE + ztf.COUNTY  # 2-digit state + 3-digit county code
ztf = ztf[['zip', 'fips']]
# a ZIP can span counties; keep the first county listed for each ZIP
ztf1 = ztf.drop_duplicates('zip', keep='first')
ztf1.shape
Out[17]:
(33120, 2)
In [20]:
ztf1.sample(10)
Out[20]:
zip fips
29798 63336 29163
32563 68520 31109
41484 92405 06071
19305 45432 39057
24617 54813 55005
39089 82834 56019
1342 04555 23015
19216 45332 39037
24654 54843 55113
35937 75656 48067
In [59]:
save_csv(ztf1, 'ztf1.csv')
<---- Saving csv file to s3 ---->

Merge Datasets

In [43]:
# Reload the saved dataset; force the ZIP columns to str so leading
# zeros survive the CSV round-trip.
nat = pd.read_csv(path_SBA + 'SBAnational_new.csv', low_memory=False, sep = ';',
                 dtype = {'Zip': str, 'Zip5d': str, 'Zip3d': str})
In [54]:
nat = nat.merge(ztf1, how = 'left', left_on = 'Zip5d', right_on = 'zip')
In [63]:
nat.sample(5)[['Zip', 'Zip3d', 'Zip5d', 'zip', 'fips']]
Out[63]:
Zip Zip3d Zip5d zip fips
813465 5736 057 05736 05736 50021
485531 5450 054 05450 05450 50011
72339 19311 193 19311 19311 42029
633333 43065 430 43065 43065 39041
863448 83702 837 83702 83702 16001
In [58]:
print pd.isnull(nat.fips).sum(), pd.notnull(nat.fips).sum()
23007 874130
In [61]:
nat.Name2.isin(df.Name2).sum()
Out[61]:
705872
In [60]:
df.head()
Out[60]:
BorrName BorrState BorrZip BusinessType Name2
0 NEW ERA, INC. TN 37041 CORPORATION NEW ERA, INC.|TN(37041)
1 M.C. INTERNATIONAL, INC TN 37207 CORPORATION M.C. INTERNATIONAL, INC|TN(37207)
2 TECHNIGRAPHICS UT 84119 PARTNERSHIP TECHNIGRAPHICS|UT(84119)
3 GREEN VALLEY GROCERY/CRAWFORD NV 89103 CORPORATION GREEN VALLEY GROCERY/CRAWFORD|NV(89103)
4 Michael Davis WI 54936 CORPORATION Michael Davis|WI(54936)
In [66]:
# Attach BusinessType; shapes before/after confirm the left merge adds
# one column and no rows (Name2 is unique in df after deduplication).
print nat.shape
nat = nat.merge(df[['Name2', 'BusinessType']] , how = 'left', on = 'Name2')
print nat.shape
(897137, 45)
(897137, 46)
In [67]:
nat.head()
Out[67]:
LoanNr_ChkDgt Name City State Zip Bank BankState NAICS ApprovalDate ApprovalFY ... loan_start loan_record_dict suffix Loan_age Previous_loan default_record_dict default_times zip fips BusinessType
0 1000014003 ABC HOBBYCRAFT EVANSVILLE IN 47711 FIFTH THIRD BANK OH 451120 1997-02-28 1997 ... 1997 {1997: 1} NO SUFFIX 0 0 NaN 0 47711 18163 INDIVIDUAL
1 1000024006 LANDMARK BAR & GRILLE (THE) NEW PARIS IN 46526 1ST SOURCE BANK IN 722410 1997-02-28 1997 ... 1997 {1997: 1} NO SUFFIX 0 0 NaN 0 46526 18039 CORPORATION
2 1000034009 WHITLOCK DDS, TODD M. BLOOMINGTON IN 47401 GRANT COUNTY STATE BANK IN 621210 1997-02-28 1997 ... 1997 {1997: 1} NO SUFFIX 0 0 NaN 0 47401 18013 INDIVIDUAL
3 1000044001 BIG BUCKS PAWN & JEWELRY, LLC BROKEN ARROW OK 74012 1ST NATL BK & TR CO OF BROKEN OK 0 1997-02-28 1997 ... 1997 {1997: 1} LLC 0 0 NaN 0 74012 40143 CORPORATION
4 1000054004 ANASTASIA CONFECTIONS, INC. ORLANDO FL 32801 FLORIDA BUS. DEVEL CORP FL 0 1997-02-28 1997 ... 1997 {1997: 1} INC 0 0 NaN 0 32801 12095 NaN

5 rows × 46 columns

In [68]:
nat.BusinessType.value_counts()
Out[68]:
CORPORATION    479336
INDIVIDUAL     196415
PARTNERSHIP     30121
Name: BusinessType, dtype: int64
In [69]:
nat[['BusinessType']].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 897137 entries, 0 to 897136
Data columns (total 1 columns):
BusinessType    705872 non-null object
dtypes: object(1)
memory usage: 13.7+ MB
In [70]:
save_csv(nat, 'SBAnational_new.csv')
<---- Saving csv file to s3 ---->
In [72]:
nat.columns.tolist()
Out[72]:
['LoanNr_ChkDgt',
 'Name',
 'City',
 'State',
 'Zip',
 'Bank',
 'BankState',
 'NAICS',
 'ApprovalDate',
 'ApprovalFY',
 'Term',
 'NoEmp',
 'NewExist',
 'CreateJob',
 'RetainedJob',
 'FranchiseCode',
 'UrbanRural',
 'RevLineCr',
 'LowDoc',
 'ChgOffDate',
 'DisbursementDate',
 'DisbursementGross',
 'BalanceGross',
 'MIS_Status',
 'ChgOffPrinGr',
 'GrAppv',
 'SBA_Appv',
 'default',
 'Zip5d',
 'Zip3d',
 'SBA_ratio',
 'Zip_length',
 'RealEstate',
 'NAICS_default_rate',
 'NAICS_group',
 'Name2',
 'loan_start',
 'loan_record_dict',
 'suffix',
 'Loan_age',
 'Previous_loan',
 'default_record_dict',
 'default_times',
 'zip',
 'fips',
 u'BusinessType']